############################################################################
############################################################################
# Replication File for
# How Market Dynamics of Domestic and Foreign Social Media Firms 
# Shape Strategies of Internet Censorship
#
# Jennifer Pan
# Updated: March 5, 2020
############################################################################

# Remove every object from the current (global) environment so the script
# starts from an empty workspace.
# NOTE(review): when this file is source()d into an interactive session this
# also deletes the caller's own objects.
rm(list = ls())

# Load packages

library(stringr)
library(plyr)

# Load data
# All input files are read relative to the working directory; setwd(".")
# leaves that directory unchanged.
setwd(".")

# top25 combines data from:
#   - Geddes, Wright, Frantz "Autocratic Regimes" (downloaded 10/21/2014)
#   - IMF GDP data (downloaded 6/22/2015)
#   - Alexa top 25 most trafficked sites (collected 7/25/2016)
top25 <- read.csv(
  file = "alexa_top25_all_newvars.csv",
  header = TRUE,
  sep = ","
)

# Descriptive information
# Cross-tabulate country code against regime once and reuse the table.
country_by_regime <- table(top25$rank_countrycode, top25$regime)
dim(country_by_regime)
  # 96 countries
# A country is counted under the second regime column when it has at least
# one row there.
table(country_by_regime[, 2] > 0)
  # 71 democratic, 25 authoritarian
# Number of distinct values = number of first occurrences.
sum(!duplicated(top25$name1))
  # 1146 unique domain names
sum(!duplicated(top25$name2))
  # 948 unique domains

# Table 1
# Keep the first row seen for each distinct name1 / name2 value.
top25.byname1 <- top25[!duplicated(top25$name1),]
top25.byname2 <- top25[!duplicated(top25$name2),]

# Tabulate both subsets over the same set of site types. Without shared
# levels, a site type missing from one subset (possible when sitetype is read
# as character rather than factor) would make the two count vectors differ in
# length and cbind() would misalign them.
sitetype.levels <- levels(factor(top25$sitetype))
top25.bysitetype <- data.frame(cbind(
  table(factor(top25.byname1$sitetype, levels = sitetype.levels)),
  table(factor(top25.byname2$sitetype, levels = sitetype.levels))))
colnames(top25.bysitetype) <- c("domain.name", "domains")

# Categories are assigned by position: entry k labels the k-th site type in
# sorted order, so this vector must be kept in sync with the data.
top25.bysitetype$category <- c("Ads", "Social Media", "Social Media", "Company", "E-Commerce","Other", "Entertainment", "Entertainment", "Other", "Social Media", "Information", "Social Media", "News", "Social Media", "Company", "Other", "Portal", "Social Media", "Search", "Social Media", "Social Media", "News", "Social Media", "Other", "Unknown", "Social Media", "Entertainment")

# Sum both count columns within each category.
top25.bycat <- ddply(top25.bysitetype, "category", function(df) c(domain.name = sum(df$domain.name), domains = sum(df$domains)))

# Share of all unique name2 values (948 in the replication data); computed
# from the data instead of hard-coding the count.
top25.bycat$percent <- top25.bycat$domains / nrow(top25.byname2)
top25.bycat[order(-top25.bycat$domain.name),]

# Table 2
# For each sm_country value: the number of distinct name1 values ...
names_per_owner <- ddply(
  top25, .(sm_country),
  function(rows) length(unique(rows$name1))
)
# ... and the number of rows carrying that value (column Freq).
rows_per_owner <- data.frame(table(top25$sm_country))
top25_countryown <- merge(
  names_per_owner, rows_per_owner,
  by.x = "sm_country", by.y = "Var1"
)
# Print with the most frequent sm_country first.
top25_countryown[order(-top25_countryown$Freq), ]

# Figure 1 left
countrynames <- read.csv(file = "countrynames.csv", header = TRUE, sep = ",")

# One row per country code: counts of social media rows (overall, domestic,
# U.S.) plus the matching proportions. sm_pr divides by 25, the number of
# ranked sites per country; the domestic/U.S. shares divide by sum(sm).
top25_country <- ddply(
  top25, .(rank_countrycode), summarize,
  sm_ct     = sum(sm),
  sm_pr     = sum(sm) / 25,
  sm_dom_ct = sum(sm_domestic, na.rm = TRUE),
  sm_dom_pr = sum(sm_domestic, na.rm = TRUE) / sum(sm),
  sm_us_ct  = sum(sm_us, na.rm = TRUE),
  sm_us_pr  = sum(sm_us, na.rm = TRUE) / sum(sm)
)

# Attach country names, then the regime of each country.
top25_country <- merge(
  top25_country, countrynames,
  by.x = "rank_countrycode", by.y = "countrycode"
)
top25_country$country <- as.character(top25_country$country)
regime_by_country <- unique(top25[c("rank_countrycode", "regime")])
top25_country <- merge(top25_country, regime_by_country, by = "rank_countrycode")
# Dark gray marks autocratic regimes, light gray everything else.
top25_country$regime_col <- ifelse(
  top25_country$regime == "autocratic", "gray10", "gray70"
)

# Countries sorted by domestic share; row k is drawn at height k.
top25_c1 <- top25_country[order(top25_country$sm_dom_pr), ]
c1_rows <- seq_along(top25_c1$sm_dom_pr)
par(mar = c(6.5, 9.5, 2, 0.5) + 0.1)
plot(
  top25_c1$sm_dom_pr, c1_rows,
  pch = 16, cex = 1,
  xlab = "Proportion of social media sites\n owned by domestic firms",
  ylab = "", yaxt = "n",
  col = top25_c1$regime_col,
  cex.axis = 1.5, cex.lab = 1.5,
  ylim = c(3, 93), mgp = c(4, 1, 0)
)
axis(2, at = c1_rows, labels = top25_c1$country, las = 2, cex.axis = 1.5)
# One horizontal bar per country from 0 to its value (segments is vectorised).
segments(
  0, c1_rows, top25_c1$sm_dom_pr, c1_rows,
  col = top25_c1$regime_col, lwd = 4
)
legend(
  x = "right", legend = c("autocratic", "democratic"),
  col = c("gray10", "gray70"), pch = 16, cex = 1.5
)

# Figure 1 right
# Countries sorted by the share of their social media sites owned by U.S.
# firms. The plot previously reused top25_c1 (sorted by domestic share),
# leaving this sorted copy unused; every call below now reads top25_c2 so the
# bars appear in increasing order of sm_us_pr.
top25_c2 <- top25_country[order(top25_country$sm_us_pr), ]
c2_rows <- seq_along(top25_c2$sm_us_pr)
par(mar = c(6.5, 9.5, 2, 0.5) + 0.1)
plot(
  top25_c2$sm_us_pr, c2_rows,
  pch = 16, cex = 1,
  xlab = "Proportion of social media sites\n owned by U.S. firms",
  ylab = "", yaxt = "n",
  col = top25_c2$regime_col,
  cex.axis = 1.5, cex.lab = 1.5,
  ylim = c(3, 93), mgp = c(4, 1, 0)
)
axis(2, at = c2_rows, labels = top25_c2$country, las = 2, cex.axis = 1.5)
# One horizontal bar per country from 0 to its value (segments is vectorised).
segments(
  0, c2_rows, top25_c2$sm_us_pr, c2_rows,
  col = top25_c2$regime_col, lwd = 4
)


#
## Twitter
#

# Load data
# stringsAsFactors = FALSE keeps text columns as character on every R version.
# On R < 4.0 read.csv() turned them into factors, and a factor passed as `col`
# is drawn using its integer codes (palette colours), not the colour names.
# NOTE(review): assumes `col` holds colour names matching the legend - confirm.
twitter_bycountry1 <- read.csv("twitter.csv", header = TRUE, stringsAsFactors = FALSE)

# Figure 2 left
# Row k of the data is drawn at height k, in file order.
tw_rows <- seq_len(nrow(twitter_bycountry1))
par(mar = c(6.5, 9.5, 2, 0.5) + 0.1)
plot(
  twitter_bycountry1$removed_avg, tw_rows,
  pch = 16, cex = 1,
  xlab = "Average Government Content Removal\nApproval Proportion (2012 - 2014)",
  ylab = "", yaxt = "n",
  col = twitter_bycountry1$col,
  cex.axis = 1.5, cex.lab = 1.5,
  mgp = c(4, 1, 0), xlim = c(0, 1)
)
axis(2, at = tw_rows, labels = twitter_bycountry1$Country, las = 2, cex.axis = 1.5)
# One horizontal bar per country from 0 to its value (segments is vectorised).
segments(
  0, tw_rows, twitter_bycountry1$removed_avg, tw_rows,
  col = twitter_bycountry1$col, lwd = 4
)
legend(
  x = "right", legend = c("autocratic", "democratic"),
  col = c("gray10", "gray70"), pch = 16, cex = 1.5
)
# NOTE(review): `$removed` is left exactly as written. If the file has no
# column named `removed`, `$` partial matching may resolve it to `removed_avg`;
# confirm which column the dashed reference line is meant to average.
abline(v = mean(twitter_bycountry1$removed, na.rm = TRUE), lty = "dashed")
text(0.1, 5, "10% Overall\ncompliance rate", pos = 4, cex = 1.5, col = "black")

# Figure 2 right
# Row k of the data is drawn at height k, in file order.
tw_req_rows <- seq_len(nrow(twitter_bycountry1))
par(mar = c(6.5, 9, 2, 1) + 0.1)
plot(
  twitter_bycountry1$requests, tw_req_rows,
  pch = 16, cex = 1,
  xlab = "Number of Government Content Removal\nRequests (2012 - 2014)",
  ylab = "", yaxt = "n",
  col = twitter_bycountry1$col,
  cex.axis = 1.5, cex.lab = 1.5,
  mgp = c(4, 1, 0)
)
axis(2, at = tw_req_rows, labels = twitter_bycountry1$Country, las = 2, cex.axis = 1.5)
# One horizontal bar per country from 0 to its request count.
segments(
  0, tw_req_rows, twitter_bycountry1$requests, tw_req_rows,
  col = twitter_bycountry1$col, lwd = 4
)


#
## Youtube
# 


# Load data
youtube <- read.csv("youtube-government-removal-outcome.csv")

# Share of all requests by removal outcome. The denominator is the number of
# rows in the file (70 in the replication data) rather than a hard-coded count.
table(youtube$Removed) / nrow(youtube)
  # 50% of requests met with no removal of content, 44% some restrictions

# Same shares after dropping rows flagged IoM == 1; the subset is built once
# and reused for both the counts and the denominator.
yb.noiom <- youtube[youtube$IoM != 1, ]
table(yb.noiom$Removed) / nrow(yb.noiom)
  # w/o innocence of muslims, 66% of requests no removal, 26% some restrictions

# Figure 3 left
# Build a Date for each row from its Period and Year, pinned to day 01.
# NOTE(review): "%B" parses full month names in the current locale - confirm
# Period holds English month names and the script runs in a matching locale.
youtube$Date <- as.Date(paste("01", paste(youtube$Period, youtube$Year, sep = " ")), format = "%d %B %Y")

# Outcome (rows) by date (columns). Rows 2, 3, 1 of the table are stacked and
# labelled None / Some / All.
# NOTE(review): assumes the sorted levels of Removed come in the order
# All, None, Some - confirm against the data.
outcome_by_date <- table(youtube$Removed, youtube$Date)
counts <- rbind(
  None = outcome_by_date[2, ],
  Some = outcome_by_date[3, ],
  All = outcome_by_date[1, ]
)
counts.cols <- c("gray88", "gray50", "gray8")

# barplot() returns the x position of each bar's centre. Using those for the
# axis keeps every date label under its bar for any number of periods, instead
# of relying on a hand-tuned sequence that only fits eight bars.
bar_mids <- barplot(
  counts,
  legend = rownames(counts),
  xlab = "Period", xaxt = "n",
  ylab = "Outcome of YouTube Removal Requests by Time",
  cex.lab = 1.5, cex.axis = 1.5,
  col = counts.cols
)
axis(1, at = bar_mids, labels = format(as.Date(colnames(counts)), "%b %y"), cex.axis = 1.1)
text(6.2, 20, "20 Requests with\nSome Removal\nRelated to\n'Innocence of Muslims'", pos = 2, cex = 1.5)

# Figure 3 right
# Outcome (rows) by country (columns), with a fourth row holding column totals.
cts.country <- as.data.frame.matrix(table(youtube$Removed, youtube$Country))
cts.country <- rbind(cts.country, colSums(cts.country))
rownames(cts.country)[4] <- "Total"

# Sort countries by total number of requests. The totals row is unlisted first
# so order() receives a plain numeric vector rather than a one-row data frame;
# drop = FALSE keeps a data frame even when there is a single country.
country_order <- order(unlist(cts.country[4, ]))
cts.country <- cts.country[, country_order, drop = FALSE]

# Keep the three outcome rows, reordered 2, 3, 1 and labelled None / Some / All.
# NOTE(review): assumes the sorted levels of Removed come in the order
# All, None, Some - confirm against the data.
cts.cty.plot <- as.matrix(cts.country[1:3, , drop = FALSE])
cts.cty.plot <- cts.cty.plot[c(2, 3, 1), , drop = FALSE]
rownames(cts.cty.plot) <- c("None", "Some", "All")

par(mar = c(6, 8.5, 2, 0.5) + 0.1)
# Horizontal stacked bars; the x axis is suppressed here and redrawn below
# with its own label spacing. Fill colours come from counts.cols, defined with
# Figure 3 left.
barplot(
  cts.cty.plot,
  horiz = TRUE, las = 2,
  legend = rownames(cts.cty.plot),
  args.legend = list(x = c("right"), bty = "n", cex = 1.5),
  xlab = "Outcome of Removal Requests by Country",
  mgp = c(3, 0.5, 0), xaxt = "n",
  col = counts.cols,
  cex = 1.1, cex.axis = 1.5, cex.lab = 1.5
)
axis(1, mgp = c(3, 1, 0), cex.axis = 1.5)

